# Standard library
import sys
import time
from collections import Counter

# The shared image helpers live in ../utils/; the path must be extended
# before the local import at the bottom of this block.
sys.path.append('../utils/')

# Third-party
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # Needs the package Pandas to be installed. Check Anaconda Environments and Packages.
import seaborn as sns  # Required: sns.heatmap is called below but seaborn was never imported.
from mpl_toolkits import mplot3d
from scipy import stats
from scipy.spatial.distance import mahalanobis
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.covariance import LedoitWolf
from sklearn.decomposition import PCA  # Needs SciKit Learn package to be installed. Check Anaconda Environments and Packages.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import label_binarize

# Local helpers (readFaces94*Faces, readLandsCapeImage, facespace, histbox,
# outlierseigenfaces, landimages, kfold, kmeansplit, ...)
from ImageUtils import *
# Load the three faces94 image sets (grayscale) and stack them into one dataset.
faces94_male = readFaces94MaleFaces(gray=True)
faces94_female = readFaces94FemaleFaces(gray=True)
faces94_malestaff = readFaces94MaleStaffFaces(gray=True)
dataset = np.vstack((faces94_male, faces94_female, faces94_malestaff))
dataset_N, height, width = dataset.shape
dataset.shape
# Mean face over the whole dataset, displayed as an image.
mean_all = np.mean(dataset.reshape(dataset_N, height*width), axis=0).reshape(height, width)
plt.imshow(mean_all, plt.cm.gray)
# Center the flattened images (N x height*width) around the mean face.
data=dataset.reshape(dataset_N, height*width) - np.mean(dataset.reshape(dataset_N, height*width), axis=0)
# N x N "small" matrix (X.X^T / (N-1)): its eigenvectors are later mapped
# back to pixel space via data.T @ u, avoiding the huge pixel-space covariance.
datasetmean=(1/(dataset_N-1))*(np.dot(data,data.T)) # Covariance matrix
print(datasetmean.shape)
u,s,vh = np.linalg.svd(datasetmean) # u: eigenvectors in columns; s: eigenvalues; vh = eigenvectors in rows
# --- Choose how many principal components reach the target variability ---
representation_percentage = 0.85 # Selected variability (fraction of total variance)
sum_eig = np.sum(s)
# Fraction of the total variance explained by each eigenvalue.
percentage_variance = np.divide(s, sum_eig)
# Smallest k whose first k components capture at least the target:
# the cumulative sum first reaches the target at index j, so k = j + 1.
# (Replaces the original check-before-accumulate loop with an equivalent
# vectorized computation; num_var/sum_var values are unchanged.)
_cumvar = np.cumsum(percentage_variance)
num_var = int(np.searchsorted(_cumvar, representation_percentage)) + 1
sum_var = _cumvar[num_var - 1]  # variance actually captured by those components
num_var_select=num_var
print("Principal components number: ",num_var_select)
print("Percent of variability captured: ",sum_var*100)
print("Images in datasets",dataset_N)
# Alternative cutoff: stop where the marginal contribution of the next
# component to the cumulative variance drops below 0.01%.
cum_per=np.cumsum(percentage_variance)
for i in range(1,len(s)):
    change=(cum_per[i]-cum_per[i-1])/cum_per[i-1]*100
    if(change<.01):
        num_var=i-1
        print("First",num_var, "components with ",cum_per[num_var]*100,"percent of variability captured and from which the contribution is less than 0.01%")
        break
# Scree-style plot of the cumulative explained variance.
plt.figure(figsize=(12,6))
plt.plot(cum_per*100)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Cumulative Summation of the Explained Variance')
plt.show()
# Map the selected eigenvectors of the small N x N matrix back to pixel
# space (eigenfaces): v = X^T u, then L2-normalize each column.
EigenvectorsA=np.dot(data.T,u[:,0:num_var_select])
NormEigenvectorsA = preprocessing.normalize(EigenvectorsA,axis=0, norm='l2')
print(np.linalg.norm(NormEigenvectorsA[:,5],ord=None))# sanity check: normalized eigenvectors of X^T.X should have unit norm
# Show the first 16 eigenfaces.
cols = 4
rows = 4
plt.figure(figsize=(30,20))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.imshow(NormEigenvectorsA[:,i].reshape(height, width), plt.cm.gray)
# Visualize reconstructions while sweeping the captured variability from
# `start` to `stop` in steps of `step` (helper from ImageUtils).
start=0.8
step=0.06
stop=1
facespace(percentage_variance,dataset,data,mean_all,u,dataset_N,height,width,start,step,stop,0)
print("Principal components number: ",num_var_select)
print("Percent of variability captured: ",sum_var*100)
print("Images in datasets",dataset_N)
# Omega: projection of every centered image onto the eigenface basis.
print("Omega matrix facespace",np.dot(data,NormEigenvectorsA).shape)
# Interactive selection of one image index, then reconstruct that image
# and a random one in the eigenface subspace (helpers from ImageUtils).
import ipywidgets as widgets  # NOTE(review): duplicate of the top-level import
n=widgets.BoundedFloatText(value=2690,min=0,max=dataset_N,description='image:')
display(n)  # `display` is provided by the notebook environment
N_image=int(n.value)
specificimage(data,dataset,NormEigenvectorsA,mean_all,N_image,dataset_N,height,width)
randomimage(data,dataset,NormEigenvectorsA,mean_all,dataset_N,height,width)
# Reconstruct every image from its projection and measure the per-image
# reconstruction error with a user-selectable norm (1, 2 or inf).
dataReconstructed=np.dot(np.dot(data,NormEigenvectorsA),NormEigenvectorsA.T)+mean_all.reshape(height*width)
print(dataReconstructed.shape)
Norm=widgets.Dropdown(options=['1', '2', 'inf'],value='2',description='Norm:',disabled=False)
display(Norm)
if str(Norm.value)=='inf':
    ordn=np.inf
else:
    ordn=int(Norm.value)
# Distance between each reconstruction and the corresponding original image.
edistance = np.linalg.norm(np.subtract(dataReconstructed, dataset.reshape(dataset_N, height*width)), ord=ordn, axis=1)
print(edistance.shape)
histbox(edistance)
# Outlier detection on the reconstruction errors via z-score (threshold 3).
threshold, outliers, zsort, indexsort, z=outlierseigenfaces(edistance,3)
print('Outliers threshold method=',np.size(outliers))
print('threshold=',threshold)
# Tabulate the outlier distances with their z-scores, largest z first.
CVresult={'outliers distance':outliers,'z':zsort}
df = pd.DataFrame(CVresult)
df.sort_values('z', axis = 0, ascending = False, inplace = True, na_position ='first')
df.head(np.size(outliers))
# Side by side: the best-reconstructed image and the worst one
# (indexsort orders images by reconstruction error, ascending).
fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,2,1)
plt.title("Similar Image")
ax1.imshow(dataset[indexsort[0]], plt.cm.gray)
ax2 = fig.add_subplot(1,2,2)
plt.title("Dissimilar Image")
ax2.imshow(dataset[indexsort[-1]], plt.cm.gray)
# Gallery of the 8 images with the largest error (z-scores in the titles).
cols = 4
rows = 2
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("z "+str(z[indexsort[-(i+1)]]),fontsize=20)
    plt.imshow(dataset[indexsort[-(i+1)]], plt.cm.gray)
# Project landscape (non-face) images into the face space; their
# reconstruction error should be noticeably larger than for faces.
landscapes = np.array(readLandsCapeImage(gray=True))
landimages(landscapes,height,width,mean_all,NormEigenvectorsA,ordn,outliers)
landimage=landscapes.reshape(landscapes.shape[0],height*width)-mean_all.reshape(height*width)
dataReconstructedland=np.dot(np.dot(landimage,NormEigenvectorsA),NormEigenvectorsA.T)+mean_all.reshape(height*width)
print(dataReconstructedland.shape)
edistanceland = np.linalg.norm(np.subtract(dataReconstructedland, landscapes.reshape(landscapes.shape[0], height*width)), ord=ordn, axis=1)
totaldistance=np.append(edistance,edistanceland)
histbox(totaldistance)
# Face/non-face classification: faces = 1, landscapes = 0; predict "face"
# when the reconstruction error is at or below the smallest outlier distance.
y_true=np.ones(dataset_N)
y_true=np.append(y_true,np.zeros(landscapes.shape[0]))
y_pred=(totaldistance<=outliers[0])*1
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('TP=', tp,'TN=',tn,'FP=',fp,'FN=', fn)
print('accuracy= ', (tp+tn)/(tp+tn+fp+fn))
# Confusion matrix of the face/non-face classifier as a seaborn heatmap.
# NOTE(review): `sns` (seaborn) is used here but never imported at the top
# of this file — this raises NameError unless the import is added.
plt.figure()
plt.title("Heatmap")
prediction_data = {'y_Actual': y_true,'y_Predicted': y_pred}
df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
confusionmatrix1 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(confusionmatrix1, annot=True,cmap='Blues', fmt='.0f');
# Flip both axes so the (1,1) cell appears at the top-left.
ax.invert_yaxis()
ax.invert_xaxis()
# Pick a landscape that was (wrongly) accepted as a face — reconstruction
# error below the outlier threshold — and show its reconstruction.
# NOTE(review): [0][3] assumes at least 4 such landscapes exist; otherwise
# this raises IndexError.
N_land= int(np.where(edistanceland < outliers[0])[0][3])
landimage=landscapes[N_land].reshape(height*width)-mean_all.reshape(height*width)# select a single image
wland=np.dot(landimage,NormEigenvectorsA)# weights w of each eigenface in the generated subspace
Reconstland=np.dot(wland,NormEigenvectorsA.T)+mean_all.reshape(height*width)# clearer form: w times the transposed eigenvectors
fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,2,1)
plt.title("Land image")
ax1.imshow(landscapes[N_land], plt.cm.gray)
ax2 = fig.add_subplot(1,2,2)
plt.title("Reconstructed land Image")
ax2.imshow(Reconstland.reshape(height, width), plt.cm.gray)
print('distancia',edistanceland[N_land])
# K-fold cross-validation of the face/non-face classifier (helper from
# ImageUtils); one row of metrics per fold.
accuracy, tncv, fpcv, fncv, tpcv=kfold(y_true,landscapes,dataset,height,width,ordn)
CVresult={'accuracy':accuracy,'tn':tncv,'fp':fpcv,'fn':fncv,'tp':tpcv}
df = pd.DataFrame(CVresult)
df.head()
# Repeat the PCA construction on faces + landscapes stacked together.
landscapes = np.array(readLandsCapeImage(gray=True))
datasetfull = np.vstack((dataset,landscapes))
datasetfull_N, height, width = datasetfull.shape
datasetfull.shape
# Mean image of the combined dataset.
mean_all_full = np.mean(datasetfull.reshape(datasetfull_N, height*width), axis=0).reshape(height, width)
plt.imshow(mean_all_full, plt.cm.gray)
# Center the flattened images and build the small N x N matrix, as before.
datafull=datasetfull.reshape(datasetfull_N, height*width) - np.mean(datasetfull.reshape(datasetfull_N, height*width), axis=0)
datasetmeanfull=(1/(datasetfull_N-1))*(np.dot(datafull,datafull.T))
print(datasetmeanfull.shape)
u,s,vh = np.linalg.svd(datasetmeanfull)
# --- Component selection for the combined dataset (80% target) ---
representation_percentage = 0.80
sum_eig = np.sum(s)
# Fraction of the total variance explained by each eigenvalue.
percentage_variance = np.divide(s, sum_eig)
# Smallest k whose first k components capture at least the target:
# the cumulative sum first reaches the target at index j, so k = j + 1.
# (Replaces the original check-before-accumulate loop with an equivalent
# vectorized computation; num_var/sum_var values are unchanged.)
_cumvar = np.cumsum(percentage_variance)
num_var = int(np.searchsorted(_cumvar, representation_percentage)) + 1
sum_var = _cumvar[num_var - 1]  # variance actually captured by those components
num_var_select=num_var
print("Principal components number: ",num_var_select)
print("Percent of variability captured: ",sum_var*100)
print("Images in datasets",datasetfull_N)
# Alternative cutoff for the combined dataset: stop where the marginal
# contribution of the next component drops below 0.01%.
cum_per=np.cumsum(percentage_variance)
for i in range(1,len(s)):
    change=(cum_per[i]-cum_per[i-1])/cum_per[i-1]*100
    if(change<.01):
        num_var=i-1
        print("First",num_var, "components with ",cum_per[num_var]*100,"percent of variability captured and from which the contribution is less than 0.01%")
        break
# Scree-style plot of the cumulative explained variance.
plt.figure(figsize=(12,6))
plt.plot(cum_per*100)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Cumulative Summation of the Explained Variance')
plt.show()
# Eigenfaces of the combined dataset: map the small-matrix eigenvectors
# back to pixel space and L2-normalize each column.
# NOTE: this re-binds EigenvectorsA/NormEigenvectorsA — everything below
# uses the faces+landscapes basis, not the faces-only one.
EigenvectorsA=np.dot(datafull.T,u[:,0:num_var_select])
NormEigenvectorsA = preprocessing.normalize(EigenvectorsA,axis=0, norm='l2')
print(np.linalg.norm(NormEigenvectorsA[:,5],ord=None))# sanity check: normalized eigenvectors of X^T.X should have unit norm
# Reconstruction sweep over the captured variability, as before.
start=0.8
step=0.06
stop=1
facespace(percentage_variance,datasetfull,datafull,mean_all_full,u,datasetfull_N,height,width,start,step,stop,dataset_N)
print("Principal components number: ",num_var_select)
print("Percent of variability captured: ",sum_var*100)
print("Images in datasets",datasetfull_N)
print("Omega matrix facespace",np.dot(datafull,NormEigenvectorsA).shape)
# Reconstruct a random image from the landscape part of the stack.
# NOTE(review): 3059 is hard-coded as the first landscape index —
# presumably it equals dataset_N (number of face images); verify.
N_image= np.random.randint(3059, high=datasetfull.shape[0], size=1)[0]
Image=datafull[N_image]
# Project onto the eigenface basis, then map back to pixel space.
w=np.dot(Image,NormEigenvectorsA)
Reconstructed=np.dot(w,NormEigenvectorsA.T)+mean_all_full.reshape(height*width)
example_image = Reconstructed
original_image = datasetfull[N_image]
fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,2,1)
plt.title("Original Image")
ax1.imshow(original_image, plt.cm.gray)
ax2 = fig.add_subplot(1,2,2)
plt.title("Reconstructed Image")
ax2.imshow(example_image.reshape(height,width), plt.cm.gray)
# Reconstruction error of every image in the combined dataset, then
# outlier detection by percentile (top 7% of distances).
datafullReconstructed=np.dot(np.dot(datafull,NormEigenvectorsA),NormEigenvectorsA.T)+mean_all_full.reshape(height*width)
print(datafullReconstructed.shape)
edistancefull = np.linalg.norm(np.subtract(datafullReconstructed, datasetfull.reshape(datasetfull_N, height*width)), ord=ordn, axis=1)
print(edistancefull.shape)
histbox(edistancefull)
z = np.abs(stats.zscore(edistancefull))
percentil_93, percentil_94 = np.percentile(edistancefull, [93, 94])
outliersindex=np.where(edistancefull > percentil_93)
outliersfull=edistancefull[outliersindex]
zsort=z[outliersindex]
# Sort the outliers (and their z-scores) by distance, ascending.
indexsortout=np.argsort(outliersfull)
outliersfull=outliersfull[indexsortout]
zsort=zsort[indexsortout]
indexsort=np.argsort(edistancefull) # image indices ordered by distance, smallest to largest
edistancesort=edistancefull[indexsort] # image distances, smallest to largest
print('Outliers threshold method=',np.size(outliersfull))
print('percentil_93=',percentil_93)
#print('Q1= ',quartile_1, 'Q3= ',quartile_3)
# Tabulate the outlier distances with their z-scores, largest z first.
CVresult={'outliers distancefull':outliersfull,'z':zsort}
df = pd.DataFrame(CVresult)
df.sort_values('z', axis = 0, ascending = False, inplace = True, na_position ='first')
df.head(10)
# Gallery of the 8 combined-dataset images with the largest reconstruction
# error; each title shows the image's z-score.
cols = 4
rows = 2
plt.figure(figsize=(25,15))
for slot in range(rows * cols):
    worst = indexsort[-(slot + 1)]
    plt.subplot(rows, cols, slot + 1)
    plt.title("z "+str(z[worst]),fontsize=20)
    plt.imshow(datasetfull[worst], plt.cm.gray)
# Face/non-face classification on the combined dataset: predict "face"
# when the reconstruction error is at or below the 93rd percentile.
y_true=np.ones(dataset_N)
y_true=np.append(y_true,np.zeros(landscapes.shape[0]))
y_pred=(edistancefull<=percentil_93)*1
#y_pred=(z <= threshold)*1
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('TP=', tp,'TN=',tn,'FP=',fp,'FN=', fn)
print('accuracy= ', (tp+tn)/(tp+tn+fp+fn))
# Same confusion matrix as a heatmap (requires seaborn as `sns`;
# NOTE(review): seaborn is never imported at the top of this file).
plt.figure()
plt.title("Heatmap")
prediction_data = {'y_Actual': y_true,'y_Predicted': y_pred}
df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
confusionmatrix1 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(confusionmatrix1, annot=True,cmap='Blues', fmt='.0f');
# Flip both axes so the (1,1) cell appears at the top-left.
ax.invert_yaxis()
ax.invert_xaxis()
print("Principal components number: ",num_var_select)
print("Percent of variability captured: ",sum_var*100)
print("Images in datasets",datasetfull_N)
print("Omega matrix facespace",np.dot(datafull,NormEigenvectorsA).shape)
# Cluster the images in weight (Omega) space into 3 groups with k-means;
# fixed random_state for reproducible cluster assignments.
omegaw=np.dot(datafull,NormEigenvectorsA)
print(omegaw.shape)
kmeans = KMeans(n_clusters=3, random_state=42).fit(omegaw)
wcentroids=kmeans.cluster_centers_
wcentroids.shape
# Show each cluster centroid reconstructed back into pixel space.
cols = 3
rows = 1
plt.figure(figsize=(12,8))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class "+str(i+1))
    plt.imshow((np.dot(kmeans.cluster_centers_[i],NormEigenvectorsA.T)+mean_all_full.reshape(height*width)).reshape(height, width), plt.cm.gray)
y_label=kmeans.labels_
# Distance of every image to its nearest centroid in weight space.
wtotaldist=kmeans.transform(omegaw)
wdistances = np.amin(wtotaldist, axis=1)
print(wdistances.shape[0])
# Inspect cluster 0: size and distance distribution.
kclass=0
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
histbox(wdistances[y_label==kclass])
# Rank cluster-0 members by distance to their centroid, ascending.
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
# Low distances: the 4 most central images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class1 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[i]], plt.cm.gray)
# High distances: the 4 most peripheral images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class1 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[-(i+1)]], plt.cm.gray)
# Same per-cluster analysis for cluster 1.
kclass=1
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
histbox(wdistances[y_label==kclass])
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
# Low distances: the 4 most central images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class2 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[i]], plt.cm.gray)
# High distances: the 4 most peripheral images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class2 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[-(i+1)]], plt.cm.gray)
# Same per-cluster analysis for cluster 2.
kclass=2
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
histbox(wdistances[y_label==kclass])
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
# Low distances: the 4 most central images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class3 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[i]], plt.cm.gray)
# High distances: the 4 most peripheral images of the cluster.
cols = 4
rows = 1
plt.figure(figsize=(25,15))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class3 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=20)
    plt.imshow(datasetfull[df2.index[-(i+1)]], plt.cm.gray)
# Ground truth for 3-way evaluation: landscapes=0, female=1, male(+staff)=2.
y_true=np.ones(faces94_male.shape[0])*2
y_true=np.append(y_true,np.ones(faces94_female.shape[0]))
y_true=np.append(y_true,np.ones(faces94_malestaff.shape[0])*2)
y_true=np.append(y_true,np.zeros(landscapes.shape[0]))
# NOTE(review): k-means cluster ids are an arbitrary permutation — comparing
# them directly to y_true only works if the ids happen to align with the
# classes above; verify (e.g. against the centroid images).
cm=confusion_matrix(y_true, y_label).ravel()
plt.figure()
plt.title("Heatmap")
prediction_data = {'y_Actual': y_true,'y_Predicted': y_label}
df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
confusionmatrix1 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(confusionmatrix1, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'female','male']); ax.yaxis.set_ticklabels(['landscape', 'female','male']);
ax.invert_yaxis()
# cm[0], cm[4], cm[8] are the diagonal of the raveled 3x3 matrix.
print('accuracy= ', (cm[0]+cm[4]+cm[8])/np.sum(cm))
# NOTE(review): these three ratios are per-class recall, not accuracy.
print('accuracy= ', cm[0]/y_true[y_true==0].shape[0],cm[4]/y_true[y_true==1].shape[0],cm[8]/y_true[y_true==2].shape[0])
# Train/test split + k-means in a helper (ImageUtils), then predict the
# cluster of held-out images.
kmeansk, NormEigenvectorsAk, X_train, X_test, y_predk= kmeansplit(datasetfull,y_label,datasetfull_N,height,width,representation_percentage,3)
print("Number images Test: "+str(y_predk.shape[0]))
print("Number images class 1: "+str(y_predk[y_predk==0].shape[0]))
print("Number images class 2: "+str(y_predk[y_predk==1].shape[0]))
print("Number images class 3: "+str(y_predk[y_predk==2].shape[0]))
# Demo: show a random test image next to its assigned cluster's centroid
# (centroid mapped back to pixel space with the training-set mean).
N_image= np.random.randint(0, high=X_test.shape[0], size=1)[0]
print('Demo: predict new image as class '+ str(y_predk[N_image]+1))
Image=X_test[N_image]# select a single image
fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,2,1)
plt.title("Image")
ax1.imshow(Image, plt.cm.gray)
ax2 = fig.add_subplot(1,2,2)
plt.title("class "+str(y_predk[N_image]+1))
ax2.imshow((np.dot(kmeansk.cluster_centers_[y_predk[N_image]],NormEigenvectorsAk.T)+np.mean(X_train.reshape(X_train.shape[0], height*width), axis=0)).reshape(height, width), plt.cm.gray)
# Build a "female" class model: mean female face projected into the
# eigenface basis; its weight vector serves as the class prototype.
# NOTE(review): NormEigenvectorsA here is the faces+landscapes basis
# (re-bound above), while mean_all is the faces-only mean — confirm this
# mix is intended.
dataclass1=faces94_female
dataset_N2, height, width = dataclass1.shape
mean_class1 = np.mean(dataclass1.reshape(dataset_N2, height*width), axis=0)
plt.imshow(mean_class1.reshape(height, width), plt.cm.gray)
omegaclass1=mean_class1 - np.mean(dataset.reshape(dataset_N, height*width), axis=0)
wclass1=np.dot(omegaclass1,NormEigenvectorsA)# weights w of each eigenface in the generated subspace
Reconstructedc1=np.dot(wclass1,NormEigenvectorsA.T)+mean_all.reshape(height*width)# clearer form: w times the transposed eigenvectors
fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,2,1)
plt.title("Female mean")
ax1.imshow(mean_class1.reshape(height, width), plt.cm.gray)
ax2 = fig.add_subplot(1,2,2)
plt.title("Reconstructed Female mean")
ax2.imshow(Reconstructedc1.reshape(height, width), plt.cm.gray)
# Weight-space distance of every female face to the class prototype.
class1=dataclass1.reshape(dataset_N2, height*width)-mean_all.reshape(height*width)
wclass1total=np.dot(class1,NormEigenvectorsA)
print(wclass1total.shape)
print(wclass1.shape)
Norm=2
edistanceclass1 = np.linalg.norm(np.subtract(wclass1total, wclass1), ord=Norm, axis=1)
print(edistanceclass1.shape)
print(wclass1.shape)
# Distribution of the within-class distances to the female prototype:
# histogram on the left, horizontal boxplot on the right.
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 4))
hist_axes = plt.subplot(1, 2, 1)
hist_axes.set_title('Histogram')
hist_axes.grid(True)
hist_axes.hist(edistanceclass1)
box_axes = plt.subplot(1, 2, 2)
box_axes.set_title('Boxplot')
box_axes.boxplot(edistanceclass1, 0, 'rs', 0)
plt.show()
# Simple gender test: a random face is classified as "not female" when the
# distance of its weight vector to the female prototype exceeds Q3 of the
# within-class distances.
quartile_1, quartile_3 = np.percentile(edistanceclass1, [25, 75])
image_index = np.random.randint(0, high=dataset_N, size=1)[0]
Imagetest=dataset[image_index].reshape(height*width) - np.mean(dataset.reshape(dataset_N, height*width), axis=0)
wtest=np.dot(Imagetest,NormEigenvectorsA)
Norm=2
test = np.linalg.norm(np.subtract(wtest, wclass1), ord=Norm, axis=0)
if test>quartile_3:
    print('no es mujer')
else:
    print('es mujer')
print(quartile_3)
print(test)
plt.imshow(dataset[image_index], plt.cm.gray)
# Distance of EVERY face's weight vector to the female prototype, then the
# same histogram/boxplot view of the distribution.
Norm=2
all_weights = np.dot(data, NormEigenvectorsA)
edistanceclass1t = np.linalg.norm(np.subtract(all_weights, wclass1), ord=Norm, axis=1)
print(edistanceclass1t.shape)
print(wclass1.shape)
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 4))
hist_axes = plt.subplot(1, 2, 1)
hist_axes.set_title('Histogram')
hist_axes.grid(True)
hist_axes.hist(edistanceclass1t)
box_axes = plt.subplot(1, 2, 2)
box_axes.set_title('Boxplot')
box_axes.boxplot(edistanceclass1t, 0, 'rs', 0)
plt.show()